{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Ler dados de um arquivo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['uf', 'cidade', 'lat', 'lng', 'cond'], ['AC', 'Acrelândia', '-9.825808', '-66.897166', 'false'], ['AC', 'Assis Brasil', '-10.9409203', '-69.5672108', 'false'], ['AC', 'Brasiléia', '-11.0012764', '-68.7487974', 'false'], ['AC', 'Bujari', '-9.8309656', '-67.9520886', 'false']]\n"
     ]
    }
   ],
   "source": [
    "def split_fields(line):\n",
    "    \"\"\"Split one line of the CSV into its comma-separated fields.\"\"\"\n",
    "    return line.split(\",\")\n",
    "\n",
    "\n",
    "# One RDD element per line of the text file.\n",
    "lines = sc.textFile(\"municipios_do_Brasil.csv\")\n",
    "\n",
    "# Plain split on commas: the header line is parsed like any other row.\n",
    "data = lines.map(split_fields)\n",
    "\n",
    "print(data.take(5))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Criando um DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the CSV as a DataFrame, using the first line as column names.\n",
    "# No schema is inferred here, so every column is loaded as a string.\n",
    "df = spark.read.option(\"header\", True).csv(\"municipios_do_Brasil.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+--------------------+------------+------------+-----+\n",
      "| uf|              cidade|         lat|         lng| cond|\n",
      "+---+--------------------+------------+------------+-----+\n",
      "| AC|          Acrelândia|   -9.825808|  -66.897166|false|\n",
      "| AC|        Assis Brasil| -10.9409203| -69.5672108|false|\n",
      "| AC|           Brasiléia| -11.0012764| -68.7487974|false|\n",
      "| AC|              Bujari|  -9.8309656| -67.9520886|false|\n",
      "| AC|            Capixaba| -10.5729683| -67.6760894|false|\n",
      "| AC|     Cruzeiro do Sul|  -7.6307956| -72.6703869|false|\n",
      "| AC|      Epitaciolândia| -11.0289439| -68.7411519|false|\n",
      "| AC|               Feijó|  -8.1639128| -70.3541781|false|\n",
      "| AC|              Jordão|  -9.4338167| -71.8843997|false|\n",
      "| AC|         Mâncio Lima|  -7.6137775| -72.8964167|false|\n",
      "| AC|       Manoel Urbano|  -8.8389428| -69.2601292|false|\n",
      "| AC|Marechal Thaumaturgo|  -8.9407511|   -72.79151|false|\n",
      "| AC|   Plácido de Castro| -10.2759758| -67.1500664|false|\n",
      "| AC|          Porto Acre|  -9.5879722|   -67.53307|false|\n",
      "| AC|        Porto Walter|  -8.2687722| -72.7444458|false|\n",
      "| AC|          Rio Branco|-9.976536213|-67.82207776| true|\n",
      "| AC|     Rodrigues Alves|  -7.7417936| -72.6473894|false|\n",
      "| AC| Santa Rosa do Purus|  -9.4338948| -70.4909019|false|\n",
      "| AC|      Sena Madureira|  -9.0659556| -68.6571058|false|\n",
      "| AC|    Senador Guiomard| -10.1509675| -67.7360856|false|\n",
      "+---+--------------------+------------+------------+-----+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Print the first 20 rows as a text table (20 is show()'s default).\n",
    "df.show(n=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- uf: string (nullable = true)\n",
      " |-- cidade: string (nullable = true)\n",
      " |-- lat: string (nullable = true)\n",
      " |-- lng: string (nullable = true)\n",
      " |-- cond: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Print each column's name, type and nullability as a tree.\n",
    "df.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+\n",
      "|              cidade|\n",
      "+--------------------+\n",
      "|          Acrelândia|\n",
      "|        Assis Brasil|\n",
      "|           Brasiléia|\n",
      "|              Bujari|\n",
      "|            Capixaba|\n",
      "|     Cruzeiro do Sul|\n",
      "|      Epitaciolândia|\n",
      "|               Feijó|\n",
      "|              Jordão|\n",
      "|         Mâncio Lima|\n",
      "|       Manoel Urbano|\n",
      "|Marechal Thaumaturgo|\n",
      "|   Plácido de Castro|\n",
      "|          Porto Acre|\n",
      "|        Porto Walter|\n",
      "|          Rio Branco|\n",
      "|     Rodrigues Alves|\n",
      "| Santa Rosa do Purus|\n",
      "|      Sena Madureira|\n",
      "|    Senador Guiomard|\n",
      "+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Project only the \"cidade\" column and display it.\n",
    "df.select(df[\"cidade\"]).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5563"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of rows in the DataFrame.\n",
    "df.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql.functions import approx_count_distinct\n",
    "\n",
    "# approx_count_distinct is an aggregate function from pyspark.sql.functions,\n",
    "# not a DataFrame method, so it is applied to the \"uf\" column through agg().\n",
    "df.agg(approx_count_distinct(\"uf\")).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+-----------------+----------+-----------+-----+\n",
      "| uf|           cidade|       lat|        lng| cond|\n",
      "+---+-----------------+----------+-----------+-----+\n",
      "| CE|          Abaiara| -7.345879|  -39.04159|false|\n",
      "| CE|          Acarape|  -4.22083| -38.705532|false|\n",
      "| CE|           Acaraú| -2.887689| -40.118306|false|\n",
      "| CE|         Acopiara| -6.089114| -39.448042|false|\n",
      "| CE|           Aiuaba|-6.5737801|-40.1269838|false|\n",
      "| CE|       Alcântaras| -3.585369| -40.547867|false|\n",
      "| CE|        Altaneira| -6.988047| -39.748055|false|\n",
      "| CE|       Alto Santo| -5.508941| -38.274318|false|\n",
      "| CE|         Amontada| -3.360166| -39.828816|false|\n",
      "| CE|Antonina do Norte| -6.769193|  -39.98697|false|\n",
      "| CE|         Apuiarés| -3.945057|  -39.43593|false|\n",
      "| CE|          Aquiraz|-3.9063524|-38.3877096|false|\n",
      "| CE|          Aracati| -4.558259| -37.767894|false|\n",
      "| CE|        Aracoiaba|  -4.36872| -38.812512|false|\n",
      "| CE|         Ararendá| -4.745667|  -40.83099|false|\n",
      "| CE|          Araripe| -7.213195| -40.135894|false|\n",
      "| CE|          Aratuba| -4.412291| -39.047117|false|\n",
      "| CE|         Arneiroz| -6.316499| -40.165285|false|\n",
      "| CE|           Assaré| -6.866897| -39.868872|false|\n",
      "| CE|           Aurora|         0|          0|false|\n",
      "+---+-----------------+----------+-----------+-----+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Keep only the rows whose \"uf\" column equals \"CE\" and display them.\n",
    "df.where(df.uf == \"CE\").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+-----+\n",
      "| uf|count|\n",
      "+---+-----+\n",
      "| SC|  293|\n",
      "| RO|   67|\n",
      "| PI|  224|\n",
      "| AM|   62|\n",
      "| GO|  246|\n",
      "| TO|  137|\n",
      "| MT|  141|\n",
      "| SP|  645|\n",
      "| ES|   78|\n",
      "| PB|  223|\n",
      "| RS|  496|\n",
      "| MS|   78|\n",
      "| AL|  102|\n",
      "| MG|  853|\n",
      "| PA|  143|\n",
      "| BA|  417|\n",
      "| SE|   75|\n",
      "| PE|  185|\n",
      "| CE|  184|\n",
      "| RN|  167|\n",
      "+---+-----+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Count the rows for each \"uf\" value; no ordering is applied to the result.\n",
    "df.groupby(df[\"uf\"]).count().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Apache Toree - PySpark",
   "language": "python",
   "name": "apache_toree_pyspark"
  },
  "language_info": {
   "codemirror_mode": "text/x-ipython",
   "file_extension": ".py",
   "mimetype": "text/x-ipython",
   "name": "python",
   "pygments_lexer": "python",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
